/*
Basic analysis applied to other disease areas
*/


/*
Multiple Sclerosis: done separately to account for issues like biosimilars (Extavia => same generic name, different product name vs. Betaseron)
*/



///////////
// SETUP //
///////////

// Build the NDC-level list of multiple sclerosis drugs from the Red Book file
// and save it as redbook_multiple_sclerosis (ndcnum, generic_id, mastfrm).
use "redbook", clear

keep ndcnum gennme mastfrm prodnme //mstfmds

// Keep an untouched copy of the generic name: the two name lists below are
// matched on upper-cased strings, generic_name_mapping on the original value.
generate gennme_temp = gennme
foreach namevar of varlist gennme prodnme {
	replace `namevar' = upper(`namevar')
}

// Pass 1: match on product name. Rows that exist only in the name list are
// discarded; "matched" records which Red Book rows were hit.
merge m:1 prodnme using "ms_chronic_prodnme"
// DEBUG: keep if _merge == 2
keep if inlist(_merge, 1, 3)
generate matched = (_merge == 3)
drop _merge

// Pass 2: match on generic name. A row survives if either pass matched it
// (adds on some variants of regular drugs).
merge m:1 gennme using "ms_chronic_gennme"
tabulate matched _merge
drop if matched != 1 & _merge != 3
drop _merge

// see if there are forms we might want indicators for
tabulate mastfrm

// might need to compress further
// Restore the original-case generic name, then keep only rows that have an
// entry in generic_name_mapping.
replace gennme = gennme_temp
merge m:1 gennme using generic_name_mapping, keep(match) nogenerate

keep ndcnum generic_id mastfrm
compress
save redbook_multiple_sclerosis, replace


// Assemble the 1996-2013 claims for multiple sclerosis drugs.
//
// Each yearly file is filtered BEFORE it joins the accumulated data, so the
// two lookup merges only ever run on one year of claims. This keeps the same
// rows as appending first and filtering afterwards, without holding a whole
// unfiltered year in memory alongside everything gathered so far and without
// re-merging earlier years on every iteration.
set more off
tempfile ms_claims

forvalues j = 1996/2013 {
	use enrolid genind ndcnum svcdate using ccaed`j', clear

	// filter data: keep only claims whose NDC is in the MS drug list
	merge m:1 ndcnum using "redbook_multiple_sclerosis", keepus(generic_id) keep(mat) nogenerate

	// start year necessary for generating first use indicator (relevant for reduced form);
	// claims for enrollees missing from all_enrolid_info are dropped
	merge m:1 enrolid using "all_enrolid_info", keep(mat) keepus(start_year) nogenerate

	// stack the years collected so far underneath the current one
	if `j' > 1996 {
		append using `ms_claims'
	}
	save `ms_claims', replace
}

// every claim kept in this section is tagged with indication id 6
gen indic_id = 6


///////////////
// END SETUP //
///////////////

//keep if indic_id == 5 // start by looking at cholesterol (but code general enough to handle several)

// figure out when a user first starts using anything in the class
//
// The flag is built with an explicit by-group sort rather than
// "sort ... / egen tag()": egen's tag() only promises to mark ONE observation
// per group and does not document which one, so it cannot be relied on to pick
// the earliest fill. Here the first row of each enrolid/indic_id group, ordered
// by service date, is flagged. The tie-break variable is written out in full
// as generic_id; this data set has no variable called "generic".
bysort enrolid indic_id (svcdate generic_id): gen byte first_use = (_n == 1)

// label new subscription: a first fill dated before June 1 of the enrollee's
// start year is not counted as a new start
// NOTE(review): the earlier comment described this as "six months after the
// start of the sample", but mdy(6,1,start_year) is June 1 -- confirm the cutoff.
replace first_use = 0 if first_use == 1 & svcdate < mdy(6,1,start_year) // MODIFIED


// NEW: also create list of first use dates
preserve

keep if first_use == 1
keep enrolid svcdate indic_id
rename svcdate first_use_date

save user_first_multiple_sclerosis, replace

restore




// ADDED: attach the product name for every claim; claims whose NDC is not in
// the Red Book file are dropped
merge m:1 ndcnum using "redbook", keepus(prodnme)
keep if _merge == 3
drop _merge



gen year = year(svcdate)
gen quarter = qofd(svcdate)
tab quarter

gen prescriptions = 1

// Crude: get drug with most prescriptions in a given quarter
// 1 entry per enrollee, quarter, indication and product at this point
// additional variables: first-use flag (allows us to see if drug used in entry period)
collapse (sum) prescriptions (max) first_use (first) year generic_id, by(enrolid indic_id quarter prodnme)


// patch for dealing with multiple prescriptions: every product row in the
// quarter carries the flag if any of them was the first use
egen first_use_temp = max(first_use), by(enrolid indic_id quarter)
replace first_use = first_use_temp
drop first_use_temp

// Keep the top product for the quarter (most prescriptions, ties broken
// alphabetically by product name). Dropped rows indicate possible combination
// therapy (need to clean this up later).
// The row is selected with an explicit by-group ordering: egen's tag() does not
// document which observation of a group it marks, so "gsort ... / egen tag()"
// does not guarantee that the highest-count product is the one kept.
gen double neg_prescriptions = -prescriptions
bysort enrolid indic_id quarter (neg_prescriptions prodnme): keep if _n == 1
drop neg_prescriptions

sort enrolid indic_id quarter

compress

save user_quarter_choice_multiple_sclerosis, replace 



// Reload the one-row-per-enrollee-quarter file written above.
use user_quarter_choice_multiple_sclerosis, clear


// BASIC: LOOK AT IMPACT OF FIRST CHOICE ON LATER CHOICE
// first_q is the enrollee's earliest observed quarter
egen first_q = min(quarter), by(enrolid)

// product chosen in that earliest quarter, copied to all of the enrollee's rows
bysort enrolid (quarter): gen first_choice = prodnme[1]

// quarters elapsed since the first observed quarter
gen period = quarter - first_q

// 1 when the product in this quarter is the same as the first one
gen chose_initial = (prodnme == first_choice)

drop first_choice





/////////////////////////////////////////////
// IDENTIFIED ANALYSIS: USING ENTRY EVENTS //
/////////////////////////////////////////////


// need name for overlap analysis: attach gennme, dropping rows without a mapping
merge m:1 generic_id using generic_name_mapping, keep(match) nogenerate

// Prefix the enrollee's current-quarter drug fields with curr_ so they are
// distinct from whatever the treatment-indicator file brings in on the join.
foreach v in generic_id gennme prodnme {
	rename `v' curr_`v'
}

// user is in the dataset only based on first use
joinby enrolid indic_id using "user_treatment_indicators_multiple_sclerosis"


// filter out irrelevant entries
// 1) not either treatment or control for that entry event
drop if treat != 1 & control != 1
replace treat = 0 if control == 1

//replace treat_alt = 0 if control_alt == 1

// 2) quarter relative to the entry quarter (pre-entry quarters are kept)
gen entry_quarter = qofd(entry_date)
gen rel_quart = quarter - entry_quarter
//keep if rel_quart >= 0

// A) generate the outcomes
// prodnme is expected to come from the joined file (presumably the entering
// product -- verify against user_treatment_indicators_multiple_sclerosis)
gen chose_entry_drug = (curr_prodnme == prodnme)
tab chose_entry_drug


// B) analyze


// Does this need correcting?
// Outcome in the enrollee's first observed quarter, spread over every row of
// the enrollee/entry event. (old code used rel_quart == 0; using the first
// choice is probably cleaner, otherwise some are not there in period 0)
gen initial = chose_entry_drug if quarter == first_q
egen initial_choice = max(initial), by(enrolid entry_num) // unnecessary? maybe only 1 entry_num per enrolid




/////////////////////
// plot ols vs. IV //
/////////////////////
// http://www.stata.com/meeting/germany14/abstracts/materials/de14_jann.pdf, slide 27


// NEW: add coefplot
// One row per quarter offset 3..20; columns are the point estimate and the
// lower/upper ends of a +/- 1.96 standard error interval.
numlist "3/20"
local offsets `r(numlist)'
foreach est in ols iv {
	matrix `est' = J(18, 3, .)
	matrix colnames `est' = coeff l95 u95
	matrix rownames `est' = `offsets'
}

// OLS series: regressing on a column of ones without a constant returns the
// share of users still on their initial product, with a robust standard error.
// initial_choice is temporarily replaced by that column of ones.
rename initial_choice initial_choice_temp
gen initial_choice = 1

forvalues i = 3/20 {
	display "Offset: `i' quarters"
	regress chose_initial initial_choice if period == `i', noconstant robust
	matrix bhat = e(b)
	matrix vhat = e(V)

	local row = `i' - 2
	matrix ols[`row', 1] = (bhat[1,1], bhat[1,1] - sqrt(vhat[1,1]) * 1.96, bhat[1,1] + sqrt(vhat[1,1]) * 1.96)
}

// put the real initial-choice indicator back for the IV series
replace initial_choice = initial_choice_temp

// IV series: initial_choice instrumented by treat, one regression per
// quarter relative to the entry quarter.
forvalues i = 3/20 {
	display "Offset: `i' quarters"
	ivregress 2sls chose_entry_drug (initial_choice=treat) if rel_quart == `i', robust
	matrix bhat = e(b)
	matrix vhat = e(V)

	local row = `i' - 2
	matrix iv[`row', 1] = (bhat[1,1], bhat[1,1] - sqrt(vhat[1,1]) * 1.96, bhat[1,1] + sqrt(vhat[1,1]) * 1.96)
}

coefplot (matrix(ols[,1]), ci((ols[,2] ols[,3]))) (matrix(iv[,1]), ci((iv[,2] iv[,3]))), ///
vertical nooff xtitle("Quarter") ytitle("Estimate (Fraction of Users)") 

graph save "coefplot_iv_ols_comparison_multiple_sclerosis", replace
graph export "coefplot_iv_ols_comparison_multiple_sclerosis.pdf", replace






/*
Remaining areas are done in a loop

2 => diabetes
3 => hypertension
4 => COPD

Steps
1) find entry dates
2) create experimental users
3) run same analysis

*/




// FIND ENTRY DATES
// For every drug (generic_id x generic flag x xr) find the earliest service
// date seen in the claims, accumulating year by year into entry_dates_all.

set more off

// start the accumulator as an empty file that only defines generic_id
clear
gen generic_id = 0
save entry_dates_all, replace


forvalues j = 1996/2013 {
	use genind ndcnum svcdate pay awp using ccaed`j', clear

	// keep claims whose NDC appears in the indication-coded Red Book extract
	merge m:1 ndcnum using "compressed_redbook_w_indications", keep(mat) keepus(generic_id indic_id xr) nogenerate

	// only cholesterol
	//keep if indic_id == 5

	// keep generic-indicator codes 1-5; codes 4 and 5 are flagged as generic
	destring genind, replace force
	keep if inrange(genind, 1, 5)

	gen generic = inlist(genind, 4, 5)
	drop genind

	// sort generic_id generic xr
	egen drug_id = group(generic_id generic xr) // NOTE: drug ID not comparable across datasets


	// NEW FILTER: discard drug-weeks with fewer than 10 claims
	gen week = week(svcdate)
	egen s = count(1), by(drug_id week)
	drop if s < 10
	drop s week


	// is there a way to weed out sketchy data?
	// could drop if entries < 10 or something
	// one row per drug: earliest remaining service date and number of claims
	gen entries = 1
	collapse (min) entry_date = svcdate (first) indic_id generic_id generic xr (count) entries, by(drug_id)
	drop if entries < 50 // in the year (weeds out incorrect generic indicators)

	drop drug_id


	// DEBUGGING: per-year copy with drug names attached
	preserve
	merge m:1 generic_id using "generic_name_mapping", keep(match) nogenerate

	order gennme
	save entry_dates_all_`j', replace // for debugging purposes (esp. stray entries for zocor, pravachol generic)
	restore


	// fold this year into the running file: earliest date and total claims per drug
	append using entry_dates_all

	collapse (min) entry_date (first) indic_id (sum) entries, by(generic_id generic xr)

	save entry_dates_all, replace

}



// as a readable resource for us: the accumulated entry dates (still in memory
// after the loop) with drug names attached, ordered by indication and date

merge m:1 generic_id using "generic_name_mapping", keep(match) nogenerate

sort indic_id entry_date

save entry_dates_all_long, replace


// 3) Reshape to set up analysis in various areas: indication id => date 1, drug 1, type 1; etc.
use entry_dates_all, clear

drop if entry_date < mdy(3,1,1996) // two months burn-in period


// number the entry events within each indication, in date order
// (the indication variable in this file is indic_id; there is no variable
// named "indication", so sorting on that name stops the script)
bysort indic_id (entry_date): gen seq = _n

// number of entry events for the indication
by indic_id: gen max_seq = _N

drop entries

// one row per indication; entry_date#, generic_id#, generic#, xr# per event
reshape wide entry_date generic_id generic xr, i(indic_id) j(seq)

// indication 5 is excluded from this file
keep if indic_id != 5

save entry_dates_other, replace





// FIND EXPERIMENTAL USERS
// For indications 2-4: classify each first-time user as treatment or control
// for every entry event, then keep one randomly chosen event per enrollee.

// The random choice below uses runiform(); fixing the seed makes the saved
// samples (and everything estimated from them) reproducible across runs.
// The value itself is arbitrary.
set seed 20140601

forvalues m = 2/4 {


	// 1) Set up the individual-level data
	use user_first_other_dates, clear

	keep if indic_id == `m'

	rename first_use_date first_date

	// attach the wide list of entry events for this indication
	merge m:1 indic_id using entry_dates_other
	keep if _merge == 3 // should all merge
	drop _merge

	summ max_seq

	local loop_end = r(max)

	forvalues i=1/`loop_end' {
		// days between the user's first use and entry event i
		gen diff = first_date - entry_date`i' 
		// treatment: first use on the entry date or up to 180 days after it
		gen treat`i' = 1 if diff >= 0 & diff <= 180
		// control: first use in the 180 days before the entry date
		gen control`i' = 1 if diff < 0 & diff >= -180
		
		//gen treat_alt`i' = 1 if diff >= 0 & diff <= 60
		//gen control_alt`i' = 1 if diff < 0 & diff >= -60
		
		//gen treat_alt_short`i' = 1 if diff >= 0 & diff <= 30
		//gen control_alt_short`i' = 1 if diff < 0 & diff >= -30
		
		tab treat`i'
		tab control`i'
		drop diff
	}

	// in the form user id, indication, entering drug characteristics, treatment/control
	display _N

	keep enrolid treat* control* generic_id* generic* xr* entry_date*

	// reshape to long, drop irrelevant entries
	reshape long treat control generic_id generic xr entry_date, i(enrolid) j(entry_num)

	drop if treat == . & control == . // drop a bunch of entries

	gen indic_id = `m'

	// summarize: how many enrollees qualify for more than one entry event
	duplicates tag enrolid, gen(t)
	tab t
	drop t

	// keep one entry event per enrollee: the one with the smallest random draw.
	// Selected with an explicit by-group ordering because egen's tag() does not
	// document which observation of a group it marks.
	gen k = runiform()
	bysort enrolid (k): keep if _n == 1

	//save user_treatment_indicators_cholesterol_10pct_sample, replace
	save user_treatment_indicators_`m', replace

}



// RUN ANALYSIS
// Same OLS vs IV comparison as for multiple sclerosis, repeated for
// indications 2-4. A drug is identified by generic_id x generic flag x xr.


forvalues m=2/4 {


	set more off
	use user_quarter_choice_other, clear

	// focus on the current indication
	keep if indic_id == `m'



	// BASIC: LOOK AT IMPACT OF FIRST CHOICE ON LATER CHOICE
	egen first_q = min(quarter), by(enrolid)

	// For each identifying field, copy its value in the enrollee's first
	// quarter to all of the enrollee's rows.
	local sources generic_id generic xr
	local targets first_choice first_generic first_xr
	forvalues k = 1/3 {
		local src : word `k' of `sources'
		local dst : word `k' of `targets'
		gen `dst'_temp = `src' if quarter == first_q
		egen `dst' = max(`dst'_temp), by(enrolid)
	}

	// quarters since the first observed quarter
	gen period = quarter - first_q

	// 1 when this quarter's drug matches the first one on all three fields
	gen chose_initial = (generic_id == first_choice) & (generic == first_generic) & (xr == first_xr)

	drop first_choice_temp first_choice


	/////////////////////////////////////////////
	// IDENTIFIED ANALYSIS: USING ENTRY EVENTS //
	/////////////////////////////////////////////

	// need name for overlap analysis: attach gennme, dropping rows without a mapping
	merge m:1 generic_id using generic_name_mapping, keep(match) nogenerate

	// Prefix the current-quarter drug fields with curr_ so the entry-event
	// fields brought in by the join keep the plain names.
	foreach v in generic_id gennme generic xr {
		rename `v' curr_`v'
	}
	drop drugid

	// user is in the dataset only based on first use
	joinby enrolid indic_id using "user_treatment_indicators_`m'"


	// filter out irrelevant entries
	// 1) not either treatment or control for that entry event
	drop if treat != 1 & control != 1
	replace treat = 0 if control == 1

	//replace treat_alt = 0 if control_alt == 1

	// 2) quarter relative to the entry quarter (pre-entry quarters are kept)
	gen entry_quarter = qofd(entry_date)
	gen rel_quart = quarter - entry_quarter
	//keep if rel_quart >= 0

	// A) generate the outcomes: current drug equals the entering drug
	gen chose_entry_drug = (curr_generic_id == generic_id) & (curr_generic == generic) & (curr_xr == xr)
	tab chose_entry_drug


	// B) analyze
	gen treat_generic = treat * generic 
	gen treat_xr = treat * xr
	// is stickiness bigger for generic choice? (doesn't have to be)


	// Does this need correcting?
	// Outcome in the enrollee's first observed quarter, spread over every row
	// of the enrollee/entry event. (old code used rel_quart == 0; using the
	// first choice is probably cleaner, otherwise some not there in period 0)
	gen initial = chose_entry_drug if quarter == first_q
	egen initial_choice = max(initial), by(enrolid entry_num) // unnecessary? maybe only 1 entry_num per enrolid

	// generic flag (as joined from the treatment-indicator file) in the
	// enrollee's first quarter, spread over all of the enrollee's rows
	gen initial_generic_temp = generic if quarter == first_q
	egen initial_generic_user = max(initial_generic_temp), by(enrolid)


	/////////////////////
	// plot ols vs. IV //
	/////////////////////
	// http://www.stata.com/meeting/germany14/abstracts/materials/de14_jann.pdf, slide 27

	// NEW: add coefplot
	// One row per quarter offset 3..20; columns are the point estimate and
	// the lower/upper ends of a +/- 1.96 standard error interval.
	numlist "3/20"
	local offsets `r(numlist)'
	foreach est in ols iv {
		matrix `est' = J(18, 3, .)
		matrix colnames `est' = coeff l95 u95
		matrix rownames `est' = `offsets'
	}

	// OLS series: regression on a column of ones without a constant, i.e. the
	// share still on the initial drug, restricted to initial_generic_user == 0.
	// initial_choice is temporarily replaced by that column of ones.
	rename initial_choice initial_choice_temp
	gen initial_choice = 1

	forvalues i = 3/20 {
		display "Offset: `i' quarters"
		regress chose_initial initial_choice if period == `i' & initial_generic_user == 0, noconstant robust
		matrix bhat = e(b)
		matrix vhat = e(V)

		local row = `i' - 2
		matrix ols[`row', 1] = (bhat[1,1], bhat[1,1] - sqrt(vhat[1,1]) * 1.96, bhat[1,1] + sqrt(vhat[1,1]) * 1.96)
	}

	// put the real initial-choice indicator back for the IV series
	replace initial_choice = initial_choice_temp

	// IV series: initial_choice instrumented by treat, per quarter relative
	// to entry, restricted to rows with generic == 0.
	forvalues i = 3/20 {
		display "Offset: `i' quarters"
		ivregress 2sls chose_entry_drug (initial_choice=treat) if rel_quart == `i' & generic == 0, robust
		matrix bhat = e(b)
		matrix vhat = e(V)

		local row = `i' - 2
		matrix iv[`row', 1] = (bhat[1,1], bhat[1,1] - sqrt(vhat[1,1]) * 1.96, bhat[1,1] + sqrt(vhat[1,1]) * 1.96)
	}

	coefplot (matrix(ols[,1]), ci((ols[,2] ols[,3]))) (matrix(iv[,1]), ci((iv[,2] iv[,3]))), ///
	vertical nooff xtitle("Quarter") ytitle("Estimate (Fraction of Users)") 

	graph save "coefplot_iv_ols_comparison_`m'", replace
	graph export "coefplot_iv_ols_comparison_`m'.pdf", replace

}




